import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
# Absolute location of the raw credit-risk CSV on the author's machine.
CSV_PATH = r"C:\Users\Iddrisu Bachokun\Desktop\Python\credit_risk_analysis\credit_risk.csv"

# Load the whole file into a DataFrame and preview the first rows.
data = pd.read_csv(CSV_PATH)
data.head()
| Id | Age | Income | Home | Emp_length | Intent | Amount | Rate | Status | Percent_income | Default | Cred_length | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 22 | 59000 | RENT | 123.0 | PERSONAL | 35000 | 16.02 | 1 | 0.59 | Y | 3 |
| 1 | 1 | 21 | 9600 | OWN | 5.0 | EDUCATION | 1000 | 11.14 | 0 | 0.10 | N | 2 |
| 2 | 2 | 25 | 9600 | MORTGAGE | 1.0 | MEDICAL | 5500 | 12.87 | 1 | 0.57 | N | 3 |
| 3 | 3 | 23 | 65500 | RENT | 4.0 | MEDICAL | 35000 | 15.23 | 1 | 0.53 | N | 2 |
| 4 | 4 | 24 | 54400 | RENT | 8.0 | MEDICAL | 35000 | 14.27 | 1 | 0.55 | Y | 4 |
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 32581 entries, 0 to 32580 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Id 32581 non-null int64 1 Age 32581 non-null int64 2 Income 32581 non-null int64 3 Home 32581 non-null object 4 Emp_length 31686 non-null float64 5 Intent 32581 non-null object 6 Amount 32581 non-null int64 7 Rate 29465 non-null float64 8 Status 32581 non-null int64 9 Percent_income 32581 non-null float64 10 Default 32581 non-null object 11 Cred_length 32581 non-null int64 dtypes: float64(3), int64(6), object(3) memory usage: 3.0+ MB
data.describe()
| Id | Age | Income | Emp_length | Amount | Rate | Status | Percent_income | Cred_length | |
|---|---|---|---|---|---|---|---|---|---|
| count | 32581.000000 | 32581.000000 | 3.258100e+04 | 31686.000000 | 32581.000000 | 29465.000000 | 32581.000000 | 32581.000000 | 32581.000000 |
| mean | 16290.006139 | 27.734600 | 6.607485e+04 | 4.789686 | 9589.371106 | 11.011695 | 0.218164 | 0.170203 | 5.804211 |
| std | 9405.479594 | 6.348078 | 6.198312e+04 | 4.142630 | 6322.086646 | 3.240459 | 0.413006 | 0.106782 | 4.055001 |
| min | 0.000000 | 20.000000 | 4.000000e+03 | 0.000000 | 500.000000 | 5.420000 | 0.000000 | 0.000000 | 2.000000 |
| 25% | 8145.000000 | 23.000000 | 3.850000e+04 | 2.000000 | 5000.000000 | 7.900000 | 0.000000 | 0.090000 | 3.000000 |
| 50% | 16290.000000 | 26.000000 | 5.500000e+04 | 4.000000 | 8000.000000 | 10.990000 | 0.000000 | 0.150000 | 4.000000 |
| 75% | 24435.000000 | 30.000000 | 7.920000e+04 | 7.000000 | 12200.000000 | 13.470000 | 0.000000 | 0.230000 | 8.000000 |
| max | 32780.000000 | 144.000000 | 6.000000e+06 | 123.000000 | 35000.000000 | 23.220000 | 1.000000 | 0.830000 | 30.000000 |
data.isna().sum()
Id 0 Age 0 Income 0 Home 0 Emp_length 895 Intent 0 Amount 0 Rate 3116 Status 0 Percent_income 0 Default 0 Cred_length 0 dtype: int64
data['Default'].unique()
array(['Y', 'N'], dtype=object)
data['Cred_length'].unique()
array([ 3, 2, 4, 8, 7, 6, 9, 10, 5, 11, 16, 15, 12, 13, 17, 14, 25,
28, 27, 22, 19, 29, 23, 26, 20, 21, 30, 24, 18], dtype=int64)
data['Percent_income'].unique()
array([0.59, 0.1 , 0.57, 0.53, 0.55, 0.25, 0.45, 0.44, 0.42, 0.16, 0.41,
0.37, 0.32, 0.3 , 0.06, 0.29, 0.31, 0.22, 0.52, 0.14, 0.49, 0.13,
0.5 , 0.35, 0.17, 0.27, 0.33, 0.08, 0.03, 0.21, 0.63, 0.47, 0.4 ,
0.07, 0.38, 0.34, 0.04, 0.23, 0.15, 0.11, 0.43, 0.51, 0.02, 0.28,
0.26, 0.19, 0.39, 0.09, 0.05, 0.61, 0.18, 0.6 , 0.01, 0.48, 0.12,
0.54, 0.56, 0.46, 0.36, 0.24, 0.2 , 0.72, 0.64, 0.69, 0.77, 0.83,
0.65, 0.67, 0.58, 0.71, 0.68, 0.7 , 0.66, 0. , 0.76, 0.62, 0.78])
data['Status'].unique()
array([1, 0], dtype=int64)
data['Rate'].isna().sum()
3116
data['Amount'].isna().sum()
0
data['Intent'].unique()
array(['PERSONAL', 'EDUCATION', 'MEDICAL', 'VENTURE', 'HOMEIMPROVEMENT',
'DEBTCONSOLIDATION'], dtype=object)
data['Emp_length'].isna().sum()
895
# Columns kept for modelling (everything except Id).
MODEL_COLUMNS = [
    "Age",
    "Income",
    "Home",
    "Emp_length",
    "Intent",
    "Amount",
    "Rate",
    "Status",
    "Percent_income",
    "Default",
    "Cred_length",
]

# Select those columns and discard every row that has a missing value.
df = data[MODEL_COLUMNS].dropna()

# Copy Default into a new trailing column, Deft, so the target ends up last.
df["Deft"] = df["Default"]
df.head()
| Age | Income | Home | Emp_length | Intent | Amount | Rate | Status | Percent_income | Default | Cred_length | Deft | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 22 | 59000 | RENT | 123.0 | PERSONAL | 35000 | 16.02 | 1 | 0.59 | Y | 3 | Y |
| 1 | 21 | 9600 | OWN | 5.0 | EDUCATION | 1000 | 11.14 | 0 | 0.10 | N | 2 | N |
| 2 | 25 | 9600 | MORTGAGE | 1.0 | MEDICAL | 5500 | 12.87 | 1 | 0.57 | N | 3 | N |
| 3 | 23 | 65500 | RENT | 4.0 | MEDICAL | 35000 | 15.23 | 1 | 0.53 | N | 2 | N |
| 4 | 24 | 54400 | RENT | 8.0 | MEDICAL | 35000 | 14.27 | 1 | 0.55 | Y | 4 | Y |
# Final column order: the ten feature columns followed by the Deft target.
# The original Default column is left out because Deft already holds a copy.
ORDERED_COLUMNS = [
    "Age",
    "Income",
    "Home",
    "Emp_length",
    "Intent",
    "Amount",
    "Rate",
    "Status",
    "Percent_income",
    "Cred_length",
    "Deft",
]
df = df[ORDERED_COLUMNS]
df.head()
| Age | Income | Home | Emp_length | Intent | Amount | Rate | Status | Percent_income | Cred_length | Deft | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 22 | 59000 | RENT | 123.0 | PERSONAL | 35000 | 16.02 | 1 | 0.59 | 3 | Y |
| 1 | 21 | 9600 | OWN | 5.0 | EDUCATION | 1000 | 11.14 | 0 | 0.10 | 2 | N |
| 2 | 25 | 9600 | MORTGAGE | 1.0 | MEDICAL | 5500 | 12.87 | 1 | 0.57 | 3 | N |
| 3 | 23 | 65500 | RENT | 4.0 | MEDICAL | 35000 | 15.23 | 1 | 0.53 | 2 | N |
| 4 | 24 | 54400 | RENT | 8.0 | MEDICAL | 35000 | 14.27 | 1 | 0.55 | 4 | Y |
df.isna().sum()
Age 0 Income 0 Home 0 Emp_length 0 Intent 0 Amount 0 Rate 0 Status 0 Percent_income 0 Cred_length 0 Deft 0 dtype: int64
# For every feature column (all columns except the trailing Deft target),
# overlay the normalised histogram of defaulters and of non-defaulters so
# the two distributions can be compared on one figure.
for label in df.columns[:-1]:
    defaulted = df.loc[df["Deft"] == "Y", label]
    not_defaulted = df.loc[df["Deft"] == "N", label]

    plt.hist(defaulted, color="red", label="Defaulted", alpha=0.7, density=True)
    # The Deft == "N" series previously reused the "Defaulted" label, which
    # made the legend show the same text for both colours.
    plt.hist(
        not_defaulted,
        color="green",
        label="Not defaulted",
        alpha=0.7,
        density=True,
    )

    plt.title(label)
    plt.xlabel(label)
    plt.ylabel("probability")
    plt.legend()
    # One figure per feature: show (and thereby finish) it before the next one.
    plt.show()
# (column, title) pairs for the box plots that compare each feature between
# defaulters ("Y") and non-defaulters ("N").
BOX_PLOTS = (
    ("Age", "Loan Default status based on Age"),
    ("Income", "Loan Default status based on Income"),
    ("Home", "Loan Default status based on Home"),
    ("Emp_length", "Loan Default status based on Employment"),
)

# Build and display one box plot per feature. Previously `fig` was rebound
# four times and only the last figure was ever displayed; showing inside the
# loop makes every plot visible. `fig` still refers to the last figure
# afterwards, as before.
for column, title in BOX_PLOTS:
    fig = px.box(
        df,
        x="Deft",
        color="Deft",
        y=column,
        title=title,
        color_discrete_map={"Y": "red", "N": "green"},
    )
    fig.update_traces(quartilemethod="exclusive")
    fig.show()
# Pie chart: how many debtors fall in each Home category.
home = df["Home"].value_counts()
transactions, quantity = home.index, home.values
fig = px.pie(df, names=transactions, values=quantity, title="Debter's type of home")
fig.show()

# Pie chart: how many debtors have each Emp_length value.
emp = df["Emp_length"].value_counts()
transactions, quantity = emp.index, emp.values
fig = px.pie(
    df,
    names=transactions,
    values=quantity,
    title="Debter's type of Employment status",
)
fig.show()

# Pie chart: how many debtors gave each loan Intent.
intent = df["Intent"].value_counts()
transactions, quantity = intent.index, intent.values
fig = px.pie(df, names=transactions, values=quantity, title="Debter's type of intent")
fig.show()

# Pie chart: how many debtors have each Status value.
status = df["Status"].value_counts()
transactions, quantity = status.index, status.values
fig = px.pie(df, names=transactions, values=quantity, title="Debter's type of Status")
fig.show()
# Integer codes for the three text columns, applied in the order listed, so
# that every column of df is numeric afterwards.
CATEGORY_CODES = {
    "Deft": {"Y": 1, "N": 0},
    "Home": {"RENT": 1, "MORTGAGE": 2, "OWN": 3, "OTHER": 4},
    "Intent": {
        "PERSONAL": 1,
        "EDUCATION": 2,
        "MEDICAL": 3,
        "VENTURE": 4,
        "HOMEIMPROVEMENT": 5,
        "DEBTCONSOLIDATION": 6,
    },
}
for column, codes in CATEGORY_CODES.items():
    df[column] = df[column].map(codes)
df.head()
| Age | Income | Home | Emp_length | Intent | Amount | Rate | Status | Percent_income | Cred_length | Deft | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 22 | 59000 | 1 | 123.0 | 1 | 35000 | 16.02 | 1 | 0.59 | 3 | 1 |
| 1 | 21 | 9600 | 3 | 5.0 | 2 | 1000 | 11.14 | 0 | 0.10 | 2 | 0 |
| 2 | 25 | 9600 | 2 | 1.0 | 3 | 5500 | 12.87 | 1 | 0.57 | 3 | 0 |
| 3 | 23 | 65500 | 1 | 4.0 | 3 | 35000 | 15.23 | 1 | 0.53 | 2 | 0 |
| 4 | 24 | 54400 | 1 | 8.0 | 3 | 35000 | 14.27 | 1 | 0.55 | 4 | 1 |
from sklearn.preprocessing import StandardScaler
# Shuffle the rows once, then cut the shuffled frame at 60 % and 80 % of its
# length: the first 60 % becomes train, the next 20 % test, the last 20 % valid.
# Plain iloc slices replace np.split, which handles a DataFrame through the
# deprecated DataFrame.swapaxes. A fixed seed makes the split reproducible
# from run to run.
SPLIT_SEED = 42
shuffled = df.sample(frac=1, random_state=SPLIT_SEED)
train_end = int(0.6 * len(df))
test_end = int(0.8 * len(df))
train = shuffled.iloc[:train_end]
test = shuffled.iloc[train_end:test_end]
valid = shuffled.iloc[test_end:]
def scle_dataset(dataframe, oversample=False):
    """Standardise the feature columns of *dataframe* and optionally oversample.

    The last column of *dataframe* is taken as the label; every other column
    is a feature.

    Args:
        dataframe: DataFrame whose last column is the label.
        oversample: When true, rebalance the classes with RandomOverSampler
            after scaling.

    Returns:
        A tuple ``(data, x, y)``: ``x`` is the scaled feature matrix, ``y``
        the label vector, and ``data`` is ``x`` with ``y`` appended as a
        final column.
    """
    # Fixed attribute typos: DataFrame exposes `.columns`, not `.cols`/`.col`.
    x = dataframe[dataframe.columns[:-1]].values
    y = dataframe[dataframe.columns[-1]].values

    scaler = StandardScaler()
    x = scaler.fit_transform(x)

    if oversample:
        # Imported locally because the file-level import of RandomOverSampler
        # appears only after this definition.
        from imblearn.over_sampling import RandomOverSampler

        ros = RandomOverSampler()
        x, y = ros.fit_resample(x, y)

    # Fixed function-name typos: np.hstack / np.reshape.
    data = np.hstack((x, np.reshape(y, (-1, 1))))
    return data, x, y
# Count the training rows whose Deft label is 0.
non_default_rows = train.loc[train["Deft"] == 0]
print(len(non_default_rows))
14162
# Count the training rows whose Deft label is 1.
default_rows = train.loc[train["Deft"] == 1]
print(len(default_rows))
3020
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import StandardScaler
def scale_dataset(dataframe, oversample=False, scaler=None):
    """Standardise the feature columns of *dataframe* and optionally oversample.

    The last column of *dataframe* is taken as the label; every other column
    is a feature.

    Args:
        dataframe: DataFrame whose last column is the label.
        oversample: When true, rebalance the classes with RandomOverSampler
            after scaling.
        scaler: Optional, already fitted scaler. When given, the features are
            transformed with it, so several splits can share one set of
            scaling statistics. When omitted, a new StandardScaler is fitted
            on *dataframe* itself (the original behaviour).

    Returns:
        A tuple ``(data, x, y)``: ``x`` is the scaled feature matrix, ``y``
        the label vector, and ``data`` is ``x`` with ``y`` appended as a
        final column.
    """
    x = dataframe[dataframe.columns[:-1]].values
    y = dataframe[dataframe.columns[-1]].values

    if scaler is None:
        x = StandardScaler().fit_transform(x)
    else:
        x = scaler.transform(x)

    if oversample:
        ros = RandomOverSampler()
        x, y = ros.fit_resample(x, y)

    data = np.hstack((x, np.reshape(y, (-1, 1))))
    return data, x, y


# Fit the scaling statistics on the training features only, then apply the
# same scaler to all three splits. Fitting a separate scaler per split would
# put valid/test on a different scale from the data the models are trained on.
feature_scaler = StandardScaler().fit(train[train.columns[:-1]].values)

# Only the training split is oversampled.
train, xtrain, ytrain = scale_dataset(train, oversample=True, scaler=feature_scaler)
valid, xvalid, yvalid = scale_dataset(valid, oversample=False, scaler=feature_scaler)
test, xtest, ytest = scale_dataset(test, oversample=False, scaler=feature_scaler)
# Number of training labels equal to 0. len() of the boolean mask would
# report the total number of labels, so count the True entries instead.
print(np.sum(ytrain == 0))
28324
# Number of training labels equal to 1. len() of the boolean mask would
# report the total number of labels, so count the True entries instead.
print(np.sum(ytrain == 1))
28324
from sklearn.metrics import classification_report
from sklearn.svm import SVC
# Train a support-vector classifier, constructed with no arguments, on the
# scaled training features and labels.
svm_model = SVC()
svm_model.fit(X=xtrain, y=ytrain)
SVC()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
SVC()
# Predict the test split with the SVM and print per-class precision/recall/F1.
y_pred = svm_model.predict(xtest)
svm_report = classification_report(ytest, y_pred)
print(svm_report)
precision recall f1-score support
0 0.98 0.76 0.86 4688
1 0.46 0.93 0.62 1040
accuracy 0.79 5728
macro avg 0.72 0.84 0.74 5728
weighted avg 0.89 0.79 0.81 5728
# Classify one hand-entered applicant with the SVM. The ten values follow the
# feature column order of df (Age ... Cred_length).
# NOTE(review): svm_model was fitted on features standardised by
# scale_dataset, but this sample is passed in raw, unscaled units - confirm
# whether it should first be transformed with the training scaler.
input_data = (23, 65500, 1, 4.0, 3, 35000, 15.23, 1, 0.53, 2)
input_data_np = np.asarray(input_data)
# predict() expects a 2-D array: one row, one column per feature.
imput_data_re = input_data_np.reshape(1, -1)

pred = svm_model.predict(imput_data_re)
print(pred)

verdict = (
    "The debtor will not default payment"
    if pred[0] == 0
    else "The debtor will default payment"
)
print(verdict)
[0] The debtor will not default payment
We see that the model predicts this sample correctly, as expected.
from sklearn.neighbors import KNeighborsClassifier
# Train a k-nearest-neighbours classifier, constructed with no arguments, on
# the scaled training features and labels.
knn_model = KNeighborsClassifier()
knn_model.fit(X=xtrain, y=ytrain)
KNeighborsClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
KNeighborsClassifier()
# Predict the test split with the KNN model and print per-class
# precision/recall/F1.
y_pred = knn_model.predict(xtest)
knn_report = classification_report(ytest, y_pred)
print(knn_report)
precision recall f1-score support
0 0.93 0.77 0.84 4688
1 0.41 0.73 0.52 1040
accuracy 0.76 5728
macro avg 0.67 0.75 0.68 5728
weighted avg 0.83 0.76 0.78 5728
# Classify one hand-entered applicant with the KNN model. The ten values
# follow the feature column order of df (Age ... Cred_length).
# NOTE(review): knn_model was fitted on features standardised by
# scale_dataset, but this sample is passed in raw, unscaled units - confirm
# whether it should first be transformed with the training scaler.
input_data = (24, 54400, 1, 8.0, 3, 35000, 14.27, 1, 0.55, 4)
input_data_np = np.asarray(input_data)
# predict() expects a 2-D array: one row, one column per feature.
imput_data_re = input_data_np.reshape(1, -1)

pred = knn_model.predict(imput_data_re)
print(pred)

verdict = (
    "The debtor will not default payment"
    if pred[0] == 0
    else "The debtor will default payment"
)
print(verdict)
[1] The debtor will default payment